library(readr)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(car)
## Loading required package: carData
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-7
library(caret)
## Loading required package: lattice
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
library(gvlma)
library(rsq)
library(readr)
library(readr)
library(MVA)
## Loading required package: HSAUR2
## Loading required package: tools
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:Metrics':
##
## auc
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(HSAUR2)
library(SciViews)
library(scatterplot3d)
library(car)
library(lattice)
library(GGally)
library(caTools)
library(ggplot2)
library(ggridges)
library(ggvis)
##
## Attaching package: 'ggvis'
## The following object is masked from 'package:Matrix':
##
## band
## The following object is masked from 'package:ggplot2':
##
## resolution
library(ggcorrplot)
library(ggthemes)
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
##
## theme_map
library(gapminder)
library(gganimate)
## No renderer backend detected. gganimate will default to writing frames to separate files
## Consider installing:
## - the `gifski` package for gif output
## - the `av` package for video output
## and restarting the R session
##
## Attaching package: 'gganimate'
## The following object is masked from 'package:ggvis':
##
## view_static
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ tibble 3.2.1 ✔ stringr 1.5.0
## ✔ tidyr 1.3.0 ✔ forcats 1.0.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::recode() masks car::recode()
## ✖ ggvis::resolution() masks ggplot2::resolution()
## ✖ purrr::some() masks car::some()
## ✖ tidyr::unpack() masks Matrix::unpack()
library(grid)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(RColorBrewer)
library(Hotelling)
## Loading required package: corpcor
##
## Attaching package: 'Hotelling'
##
## The following object is masked from 'package:dplyr':
##
## summarise
library(stats)
library(biotools)
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## ---
## biotools version 4.2
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
library(ggfortify)
library(psych)
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:car':
##
## logit
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(cluster)
library(corrplot)
## corrplot 0.92 loaded
library(caret)
library(MASS)
library(ggplot2)
library(memisc)
##
## Attaching package: 'memisc'
##
## The following object is masked from 'package:purrr':
##
## %@%
##
## The following object is masked from 'package:tibble':
##
## view
##
## The following objects are masked from 'package:dplyr':
##
## collect, recode, rename, syms
##
## The following object is masked from 'package:Matrix':
##
## as.array
##
## The following object is masked from 'package:car':
##
## recode
##
## The following object is masked from 'package:ggplot2':
##
## syms
##
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
##
## The following object is masked from 'package:base':
##
## as.array
library(ROCR)
library(dplyr)
library(klaR)
library(readxl)
abalone <- read_excel("~/Documents/Courses/Multivariate Analysis/Final/Final_MVA_2023.xlsx")
summary(abalone)
## Gender Length Diameter Height
## Length:2835 Min. :0.1550 Min. :0.1100 Min. :0.0150
## Class :character 1st Qu.:0.5150 1st Qu.:0.4000 1st Qu.:0.1350
## Mode :character Median :0.5850 Median :0.4600 Median :0.1550
## Mean :0.5696 Mean :0.4464 Mean :0.1544
## 3rd Qu.:0.6350 3rd Qu.:0.5000 3rd Qu.:0.1750
## Max. :0.8150 Max. :0.6500 Max. :1.1300
## Whole Weight Shucked Weight Viscera weight Shell weight
## Min. :0.0155 Min. :0.0065 Min. :0.0030 Min. :0.0050
## 1st Qu.:0.7013 1st Qu.:0.2870 1st Qu.:0.1520 1st Qu.:0.2025
## Median :1.0030 Median :0.4315 Median :0.2170 Median :0.2850
## Mean :1.0168 Mean :0.4391 Mean :0.2225 Mean :0.2912
## 3rd Qu.:1.2895 3rd Qu.:0.5687 3rd Qu.:0.2875 3rd Qu.:0.3650
## Max. :2.8255 Max. :1.4880 Max. :0.7600 Max. :1.0050
## Rings
## Min. : 3.0
## 1st Qu.: 9.0
## Median :10.0
## Mean :10.9
## 3rd Qu.:12.0
## Max. :29.0
abalone$Age <- abalone$Rings + 1.5
View(abalone)
colSums(is.na(abalone))
## Gender Length Diameter Height Whole Weight
## 0 0 0 0 0
## Shucked Weight Viscera weight Shell weight Rings Age
## 0 0 0 0 0
Creating dummy vars for gender
dummy_df <- dummyVars("~ Gender", data = abalone)
# Apply the dummy variables to the original data frame
transformed_df <- predict(dummy_df, newdata = abalone[1])
# View the transformed data frame
#head(transformed_df)
df <- cbind(abalone, transformed_df)
df = df[, -1]
View(df)
Correlation plot
ggpairs(data=df, title="Abalone")
What attributes proved valuable in predicting the age? 1. Shell weight 2. Height 3. Diameter These values are the most valuable for predicting age. There is also rings with a 1.00 correlation because it is just rings + 1.5 so that does not count.
Taking log of the data as the data is non linear in nature
ablog <- abalone
# Take the log of the dataset
ablog$Length <- log(ablog$Length)
ablog$Diameter <- log(ablog$Diameter)
ablog$Height <- log(ablog$Height)
ablog$`Whole Weight` <- log(ablog$`Whole Weight`)
ablog$`Shucked Weight` <- log(ablog$`Shucked Weight`)
ablog$`Viscera weight` <- log(ablog$`Viscera weight`)
ablog$`Shell weight` <- log(ablog$`Shell weight`)
ablog$Age <- log(ablog$Age)
ablog$Rings <- log(ablog$Rings)
summary(ablog)
## Gender Length Diameter Height
## Length:2835 Min. :-1.8643 Min. :-2.2073 Min. :-4.1997
## Class :character 1st Qu.:-0.6636 1st Qu.:-0.9163 1st Qu.:-2.0025
## Mode :character Median :-0.5361 Median :-0.7765 Median :-1.8643
## Mean :-0.5797 Mean :-0.8253 Mean :-1.8948
## 3rd Qu.:-0.4541 3rd Qu.:-0.6931 3rd Qu.:-1.7430
## Max. :-0.2046 Max. :-0.4308 Max. : 0.1222
## Whole Weight Shucked Weight Viscera weight Shell weight
## Min. :-4.166915 Min. :-5.0360 Min. :-5.8091 Min. :-5.298317
## 1st Qu.:-0.354891 1st Qu.:-1.2483 1st Qu.:-1.8839 1st Qu.:-1.597018
## Median : 0.002996 Median :-0.8405 Median :-1.5279 Median :-1.255266
## Mean :-0.116366 Mean :-0.9754 Mean :-1.6411 Mean :-1.358663
## 3rd Qu.: 0.254255 3rd Qu.:-0.5643 3rd Qu.:-1.2465 3rd Qu.:-1.007858
## Max. : 1.038685 Max. : 0.3974 Max. :-0.2744 Max. : 0.004988
## Rings Age
## Min. :1.099 Min. :1.504
## 1st Qu.:2.197 1st Qu.:2.351
## Median :2.303 Median :2.442
## Mean :2.353 Mean :2.490
## 3rd Qu.:2.485 3rd Qu.:2.603
## Max. :3.367 Max. :3.418
colSums(is.na(ablog))
## Gender Length Diameter Height Whole Weight
## 0 0 0 0 0
## Shucked Weight Viscera weight Shell weight Rings Age
## 0 0 0 0 0
dummy_df_log <- dummyVars("~ Gender", data = ablog)
# Apply the dummy variables to the original data frame
transformed_df_log <- predict(dummy_df_log, newdata = ablog[1])
# View the transformed data frame
head(transformed_df_log)
## GenderF GenderM
## 1 0 1
## 2 0 1
## 3 1 0
## 4 0 1
## 5 1 0
## 6 1 0
df_log <- cbind(ablog, transformed_df_log)
ggpairs(data=df_log, aes(colour = Gender, alpha = 0.8), title="Pairs plot for abalone dataset") +
theme_grey(base_size = 8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
df_log = df_log[, -1]
Train test split
set.seed(42) # set a seed value for reproducibility
trainIndexlog <- createDataPartition(df_log$Rings, p = 0.7, list = FALSE)
trainDatalog <- df_log[trainIndexlog, ]
testDatalog <- df_log[-trainIndexlog, ]
abalone.reg.log <- lm(Rings~Length + Diameter + Height + `Whole Weight` + `Shucked Weight` + `Viscera weight` + `Shell weight` + GenderF + GenderM, data=trainDatalog)
summary(abalone.reg.log)
##
## Call:
## lm(formula = Rings ~ Length + Diameter + Height + `Whole Weight` +
## `Shucked Weight` + `Viscera weight` + `Shell weight` + GenderF +
## GenderM, data = trainDatalog)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53996 -0.12533 -0.01387 0.10732 0.72066
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.986736 0.122958 16.158 < 2e-16 ***
## Length -0.375454 0.135945 -2.762 0.005801 **
## Diameter 0.152652 0.121675 1.255 0.209778
## Height 0.059538 0.034662 1.718 0.086010 .
## `Whole Weight` 0.859043 0.087705 9.795 < 2e-16 ***
## `Shucked Weight` -0.791485 0.043552 -18.173 < 2e-16 ***
## `Viscera weight` -0.117463 0.031819 -3.692 0.000229 ***
## `Shell weight` 0.348638 0.041921 8.317 < 2e-16 ***
## GenderF -0.004024 0.008590 -0.468 0.639517
## GenderM NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1885 on 1977 degrees of freedom
## Multiple R-squared: 0.4955, Adjusted R-squared: 0.4934
## F-statistic: 242.7 on 8 and 1977 DF, p-value: < 2.2e-16
predictionslog <- predict(abalone.reg.log, newdata=testDatalog)
## Warning in predict.lm(abalone.reg.log, newdata = testDatalog): prediction from
## a rank-deficient fit may be misleading
RMSE(predictionslog, testDatalog[, 8])
## [1] 0.1928701
y_pred <- exp(predictionslog)
y_actual <- exp(testDatalog[, 8])
ape <- abs((y_actual - y_pred) / y_actual) * 100
mape <- mean(ape, na.rm = TRUE)
cat("MAPE:", round(mape, 2), "%")
## MAPE: 15.03 %
plot(abalone.reg.log, which = 1)
plot(abalone.reg.log, which = 2) #Normal Q-Q PLOT
hist(abalone.reg.log$residuals)
Accuracy of the model (MAPE) is 15.03% This accuracy is lower because I have taken log transformation of the entire dataset since it was non linear The model is not a perfect normal disttribution. residual vs fitted plot shows some pattern in the data, as the residuals are increasing as the fitted values are increasing. The qq plot has a tail which indicates that there are some errors. there is scope to improve this model.
Logistic Regression
ggpairs(data=abalone, aes(colour = Gender, alpha = 0.8), title="Pairs plot for abalone dataset") +
theme_grey(base_size = 8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Train test split
set.seed(42) # set a seed value for reproducibility
abalone$Sex_binary <- ifelse(abalone$Gender == "M", 1, 0)
abalone$Sex_binary <- as.factor(abalone$Sex_binary)
train_index1 <- sample(1:nrow(abalone), nrow(abalone)*0.7)
train_data1 <- abalone[train_index1, ]
test_data1 <- abalone[-train_index1, ]
Model
logit_model <- glm(Sex_binary ~ Length + Diameter + Height + `Whole Weight` + `Shucked Weight` + `Viscera weight` + `Shell weight` + Rings, data=train_data1, family=binomial)
Predict
test_data1$predicted_sex <- predict(logit_model, newdata=test_data1, type="response")
test_data1$predicted_sex_binary <- ifelse(test_data1$predicted_sex > 0.5, 1, 0)
Evaluate model
confusionMatrix(as.factor(test_data1$predicted_sex_binary), test_data1$Sex_binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 142 127
## 1 251 331
##
## Accuracy : 0.5558
## 95% CI : (0.5217, 0.5895)
## No Information Rate : 0.5382
## P-Value [Acc > NIR] : 0.1594
##
## Kappa : 0.086
##
## Mcnemar's Test P-Value : 2.509e-10
##
## Sensitivity : 0.3613
## Specificity : 0.7227
## Pos Pred Value : 0.5279
## Neg Pred Value : 0.5687
## Prevalence : 0.4618
## Detection Rate : 0.1669
## Detection Prevalence : 0.3161
## Balanced Accuracy : 0.5420
##
## 'Positive' Class : 0
##
precision(test_data1$predicted_sex_binary, test_data1$Sex_binary)
## [1] 0.7227074
recall(test_data1$predicted_sex_binary, test_data1$Sex_binary)
## Warning in mean.default(predicted[actual == 1]): argument is not numeric or
## logical: returning NA
## [1] NA
roc <- roc(test_data1$Gender, test_data1$predicted_sex_binary)
## Setting levels: control = F, case = M
## Setting direction: controls < cases
plot(roc, main = "ROC curve", print.auc = TRUE)
The confusion matrix shows that the model predicted 142 cases as 0 and the actual value was also 0. The model predicted 127 cases as 1, but the actual value was 0. The model predicted 251 cases as 0, but the actual value was 1. Finally, the model predicted 331 cases as 1, and the actual value was also 1.
The accuracy is 55.58% of the logistic regression model. Sensitivity
: 0.3613
Specificity : 0.7227
AUC: 0.54 We can predict the gender but it is barely better than a coin
flip.
# Build an LDA model
lda_model <- lda(Sex_binary ~ . - Sex_binary, data = train_data1[, 2:11])
## Warning in lda.default(x, grouping, ...): variables are collinear
# Check the model summary
summary(lda_model)
## Length Class Mode
## prior 2 -none- numeric
## counts 2 -none- numeric
## means 18 -none- numeric
## scaling 9 -none- numeric
## lev 2 -none- character
## svd 1 -none- numeric
## N 1 -none- numeric
## call 3 -none- call
## terms 3 terms call
## xlevels 0 -none- list
# Make predictions on the test set
lda_pred <- predict(lda_model, newdata = test_data1[, 2:11])
lda_pred <- lda_pred$class
# Calculate accuracy measures
table(lda_pred, test_data1$Sex_binary)
##
## lda_pred 0 1
## 0 139 121
## 1 254 337
acc <- sum(lda_pred == test_data1$Sex_binary)/length(test_data1$Sex_binary)
acc
## [1] 0.559342
The accuracy is 55.93% It is barely better than the logistic regression model.
Logistic regression using PCA
pca <- prcomp(abalone[,2:9],scale=TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4923 0.9697 0.6431 0.42965 0.35503 0.30635 0.14437
## Proportion of Variance 0.7764 0.1175 0.0517 0.02307 0.01576 0.01173 0.00261
## Cumulative Proportion 0.7764 0.8940 0.9457 0.96874 0.98450 0.99623 0.99884
## PC8
## Standard deviation 0.09646
## Proportion of Variance 0.00116
## Cumulative Proportion 1.00000
fviz_eig(pca, addlabels = TRUE)
I am choosing 3 PC components after looking at the scree plot.
pca <- as.data.frame(pca$x)
pca <- pca[1:3]
pca$Gender <- abalone$Sex_binary
Train test split
set.seed(42) # set a seed value for reproducibility
train_index2 <- sample(1:nrow(pca), nrow(pca)*0.7)
train_data2 <- pca[train_index2, ]
test_data2 <- pca[-train_index2, ]
Model
logit_model2 <- glm(Gender ~ PC1 + PC2 + PC3, data=train_data2, family=binomial)
Predict
test_data2$predicted_sex <- predict(logit_model2, newdata=test_data2, type="response")
test_data2$predicted_sex_binary <- ifelse(test_data2$predicted_sex > 0.5, 1, 0)
Evaluate model
confusionMatrix(as.factor(test_data2$predicted_sex_binary), test_data2$Gender)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 93 97
## 1 300 361
##
## Accuracy : 0.5335
## 95% CI : (0.4993, 0.5674)
## No Information Rate : 0.5382
## P-Value [Acc > NIR] : 0.6218
##
## Kappa : 0.0258
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.2366
## Specificity : 0.7882
## Pos Pred Value : 0.4895
## Neg Pred Value : 0.5461
## Prevalence : 0.4618
## Detection Rate : 0.1093
## Detection Prevalence : 0.2233
## Balanced Accuracy : 0.5124
##
## 'Positive' Class : 0
##
precision(test_data2$predicted_sex_binary, test_data2$Gender)
## [1] 0.7882096
recall(test_data2$predicted_sex_binary, test_data2$Gender)
## Warning in mean.default(predicted[actual == 1]): argument is not numeric or
## logical: returning NA
## [1] NA
roc1 <- roc(test_data2$Gender, test_data2$predicted_sex_binary)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc1, main = "ROC curve", print.auc = TRUE)
We are getting worse accuracy with PCA.
Logistic Regression using FA
fit.pc <- principal(abalone[, 2:9], nfactors=4, rotate="varimax")
fit.pc
## Principal Components Analysis
## Call: principal(r = abalone[, 2:9], nfactors = 4, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC3 RC2 RC4 h2 u2 com
## Length 0.82 0.29 0.15 0.45 0.99 0.01175 1.9
## Diameter 0.81 0.30 0.18 0.46 0.99 0.01016 2.0
## Height 0.44 0.87 0.17 0.12 1.00 0.00051 1.6
## Whole Weight 0.93 0.30 0.17 0.10 0.99 0.00871 1.3
## Shucked Weight 0.93 0.27 0.00 0.09 0.95 0.05381 1.2
## Viscera weight 0.91 0.29 0.12 0.06 0.93 0.06599 1.2
## Shell weight 0.81 0.33 0.35 0.12 0.91 0.09080 1.8
## Rings 0.13 0.12 0.98 0.05 0.99 0.00831 1.1
##
## RC1 RC3 RC2 RC4
## SS loadings 4.76 1.31 1.21 0.47
## Proportion Var 0.60 0.16 0.15 0.06
## Cumulative Var 0.60 0.76 0.91 0.97
## Proportion Explained 0.61 0.17 0.16 0.06
## Cumulative Proportion 0.61 0.78 0.94 1.00
##
## Mean item complexity = 1.5
## Test of the hypothesis that 4 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.01
## with the empirical chi square 26.47 with prob < 1.8e-06
##
## Fit based upon off diagonal values = 1
round(fit.pc$values, 3)
## [1] 6.212 0.940 0.414 0.185 0.126 0.094 0.021 0.009
fit.pc$loadings
##
## Loadings:
## RC1 RC3 RC2 RC4
## Length 0.824 0.294 0.151 0.448
## Diameter 0.806 0.305 0.182 0.462
## Height 0.444 0.872 0.171 0.117
## Whole Weight 0.927 0.303 0.174
## Shucked Weight 0.931 0.267
## Viscera weight 0.912 0.290 0.121
## Shell weight 0.814 0.327 0.352 0.125
## Rings 0.125 0.119 0.979
##
## RC1 RC3 RC2 RC4
## SS loadings 4.761 1.307 1.213 0.469
## Proportion Var 0.595 0.163 0.152 0.059
## Cumulative Var 0.595 0.759 0.910 0.969
# Loadings with more digits
for (i in c(1,3,2,4)) { print(fit.pc$loadings[[1,i]])}
## [1] 0.8235491
## [1] 0.1508324
## [1] 0.293559
## [1] 0.4484296
# Communalities
fit.pc$communality
## Length Diameter Height Whole Weight Shucked Weight
## 0.9882495 0.9898375 0.9994898 0.9912878 0.9461900
## Viscera weight Shell weight Rings
## 0.9340059 0.9092005 0.9916902
# Rotated factor scores, Notice the columns ordering: RC1, RC3, RC2 and RC4
# Play with FA utilities
fa.parallel(abalone[, 2:9]) # See factor recommendation
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Parallel analysis suggests that the number of factors = 3 and the number of components = 1
fa.plot(fit.pc) # See Correlations within Factors
fa.diagram(fit.pc) # Visualize the relationship
vss(abalone[, 2:9]) # See Factor recommendations for a simple structure
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
##
## Very Simple Structure
## Call: vss(x = abalone[, 2:9])
## VSS complexity 1 achieves a maximimum of 0.97 with 1 factors
## VSS complexity 2 achieves a maximimum of 0.99 with 2 factors
##
## The Velicer MAP achieves a minimum of 0.12 with 1 factors
## BIC achieves a minimum of 166.96 with 4 factors
## Sample Size adjusted BIC achieves a minimum of 173.31 with 4 factors
##
## Statistics by number of factors
## vss1 vss2 map dof chisq prob sqresid fit RMSEA BIC SABIC complex
## 1 0.97 0.00 0.12 20 7.7e+03 0.0e+00 1.19 0.97 0.37 7555 7618 1.0
## 2 0.94 0.99 0.15 13 5.4e+03 0.0e+00 0.46 0.99 0.38 5316 5358 1.2
## 3 0.91 0.97 0.24 7 5.8e+02 3.1e-121 0.45 0.99 0.17 525 548 1.5
## 4 0.88 0.94 0.22 2 1.8e+02 2.0e-40 0.48 0.99 0.18 167 173 1.7
## 5 0.86 0.93 0.34 -2 1.4e+01 NA 0.47 0.99 NA NA NA 1.9
## 6 0.87 0.94 0.45 -5 2.1e+00 NA 0.43 0.99 NA NA NA 1.9
## 7 0.86 0.93 1.00 -7 1.4e-04 NA 0.43 0.99 NA NA NA 1.9
## 8 0.86 0.93 NA -8 1.4e-04 NA 0.43 0.99 NA NA NA 1.9
## eChisq SRMR eCRMS eBIC
## 1 3.7e+02 4.9e-02 0.0575 216
## 2 4.7e+01 1.7e-02 0.0252 -57
## 3 1.6e+00 3.2e-03 0.0064 -54
## 4 5.3e-01 1.8e-03 0.0069 -15
## 5 1.2e-02 2.8e-04 NA NA
## 6 8.7e-04 7.4e-05 NA NA
## 7 5.9e-08 6.1e-07 NA NA
## 8 5.9e-08 6.1e-07 NA NA
fit.pc <- as.data.frame(fit.pc$scores)
fit.pc <- fit.pc[1:2]
fit.pc$Gender <- abalone$Sex_binary
fit.pc$Rings <- abalone$Rings
Train test split
set.seed(42) # set a seed value for reproducibility
train_index3 <- sample(1:nrow(fit.pc), nrow(fit.pc)*0.7)
train_data3 <- fit.pc[train_index3, ]
test_data3 <- fit.pc[-train_index3, ]
Model
logit_model3 <- glm(Gender ~ RC1 + RC3, data=train_data3, family=binomial)
Predict
test_data3$predicted_sex <- predict(logit_model3, newdata=test_data3, type="response")
test_data3$predicted_sex_binary <- ifelse(test_data3$predicted_sex > 0.5, 1, 0)
Evaluate model
confusionMatrix(as.factor(test_data3$predicted_sex_binary), test_data3$Gender)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 56 62
## 1 337 396
##
## Accuracy : 0.5311
## 95% CI : (0.497, 0.5651)
## No Information Rate : 0.5382
## P-Value [Acc > NIR] : 0.6728
##
## Kappa : 0.0075
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.1425
## Specificity : 0.8646
## Pos Pred Value : 0.4746
## Neg Pred Value : 0.5402
## Prevalence : 0.4618
## Detection Rate : 0.0658
## Detection Prevalence : 0.1387
## Balanced Accuracy : 0.5036
##
## 'Positive' Class : 0
##
precision(test_data3$predicted_sex_binary, test_data3$Gender)
## [1] 0.8646288
recall(test_data3$predicted_sex_binary, test_data3$Gender)
## Warning in mean.default(predicted[actual == 1]): argument is not numeric or
## logical: returning NA
## [1] NA
roc2 <- roc(test_data3$Gender, test_data3$predicted_sex_binary)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc2, main = "ROC curve", print.auc = TRUE)
FA also does not give us promising results for logistic regression to predict gender. Dimentionality reduction techniques did not make the classification model better. PCA and FA both failed to perform.
- Linear regression model is decent with a MAPE accuracy of 15%. The residual analysis shows that there are discernable patterns in the data even after taking a log transformation of the dataset. the qq plot shows that there is a tail indicating that it does not follow a perfect normal distribution.
Attributes proved valuable in predicting the age are: 1. Shell weight 2. Height 3. Diameter There is also rings with a 1.00 correlation because it is just rings + 1.5 so that does not count.
- The MAPE accuracy on the Rings column is 15%.
- We can predict the gender but it is barely better than a coin flip. I have used various accuracy measures like AUC, ROC and confusion matrix. Logistic regression accuracy: 55.58% LDA: 55.93%
- No, it dimention reduction does not improve our accuracy. PCA: 53.35% FA: 53.11%